package com.liang.interview.server.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Pattern;

@Slf4j
public class PDFUtil {

    private static final Pattern pattern = Pattern.compile("\\s*|\t|\r|\n");

    public static byte[] loadPDFAsByteArray(String pdfUrl) throws Exception {
        URL url = new URL(pdfUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        connection.setDoInput(true);
        connection.setUseCaches(false);
        connection.setConnectTimeout(10000);
        connection.setReadTimeout(10000);

        try (InputStream inputStream = connection.getInputStream();
             ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {

            byte[] buffer = new byte[1024];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                byteArrayOutputStream.write(buffer, 0, bytesRead);
            }

            return byteArrayOutputStream.toByteArray();
        }
    }

    public static byte[] convertFileToByteArray(File file) {
        byte[] bytes = null;
        try (FileInputStream fis = new FileInputStream(file);
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int bytesRead;
            while ((bytesRead = fis.read(buffer)) != -1) {
                bos.write(buffer, 0, bytesRead);
            }
            bytes = bos.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return bytes;
    }

    /**
     * 获取pdf的text
     */
    public static String getPdfText(String pdfUrl) {
        PDDocument document = null;
        String text = "";
        try {
            byte[] bytes = loadPDFAsByteArray(pdfUrl);
            document = Loader.loadPDF(bytes);
//            document = Loader.loadPDF(convertFileToByteArray(new File("D:\\下载\\resume.pdf")));
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            stripper.setStartPage(0);
            stripper.setEndPage(Integer.MAX_VALUE);
            text = stripper.getText(document);
            text = pattern.matcher(text).replaceAll("");
            if (log.isInfoEnabled()) {
                log.info("提取的PDF文本: {}", text);
            }
        } catch (
                Exception e) {
            log.error("获取pdf转为文字错误:{}", e.getMessage(), e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    log.error("close error", e);
                }
            }
        }
        return text;
    }
}